﻿# 该程序是用来进行数据清理的，将数据库中有记录但在本地无pdf文件的数据删除

import os
import pymysql
import re
import redis

# 这是mysql数据库的连接属性
MYSQL_CONNECT = {
    'MYSQL_HOST': '114.115.219.219',
    'MYSQL_PORT': 3306,
    'MYSQL_DB': 'globallegaldatabase',
    'MYSQL_USER': 'root',
    'MYSQL_PASSWD': 'xinanzhengfadaqxueapp03!',
    'MYSQL_CHARSET': 'utf8'
}
# 这是redis数据库信息
REDIS_CONNECT = {
    'REDIS_HOST': '114.115.219.219',
    'REDIS_PORT': 6379,
    'REDIS_PASSWD': 'CNKIcnki614',
    'REDIS_DB': '1'
}
# 连接数据库
host = MYSQL_CONNECT['MYSQL_HOST']
port = MYSQL_CONNECT['MYSQL_PORT']
db = MYSQL_CONNECT['MYSQL_DB']
user = MYSQL_CONNECT['MYSQL_USER']
password = MYSQL_CONNECT['MYSQL_PASSWD']
redis_host = REDIS_CONNECT['REDIS_HOST']
redis_port = REDIS_CONNECT['REDIS_PORT']
redis_passwd = REDIS_CONNECT['REDIS_PASSWD']
redis_db = REDIS_CONNECT['REDIS_DB']
MYSQL_TABLE = ['lawtext', 'lawcasetext']            # 这是数据库存储数据的表
REDIS_KEY = {'lawtext': 'Law', 'lawcasetext': 'Lawcase'}
pool = redis.ConnectionPool(host=redis_host, port=redis_port, password=redis_passwd, db=redis_db, decode_responses=True)
urlconn = pymysql.connect(host=host, port=port, db=db, user=user, passwd=password, charset='utf8')
SQL_COUNTRYS = ['LAWCOUNTRYTG']
# SQL_COUNTRYS = ['LAWCOUNTRYXJP', 'LAWCOUNTRYFLB', 'LAWCOUNTRYWL', 'LAWCOUNTRYYDNXY', 'LAWCOUNTRYMLXY',
#                 'LAWCOUNTRYTG', 'LAWCOUNTRYYN', 'LAWCOUNTRYLW', 'LAWCOUNTRYJPZ', 'LAWCOUNTRYMD']
for table in MYSQL_TABLE:
    for country in SQL_COUNTRYS:
        sysid = []
        # 创建游标
        urlcursor = urlconn.cursor()
        sql = "SELECT ID,FileUrl,DownLoadUrl,SYSID FROM %s WHERE SortA = '%s'" % (table, country)
        print(sql)
        try:
            urlcursor.execute(sql)
            rs = urlcursor.fetchall()
            for r in rs:
                r_l = r[1]
                res_tr = r'''(?<=/PDF).*'''  # 从数据库记录中获取PDF路径的正则表达式
                re_tr = re.findall(res_tr, r_l, re.S)  # 利用正则表达式获取PDF路径并存到链表
                re_a = re_tr[0]  # 取链表的第一个值
                pdf_path = 'D:\\GlobalLawFiles' + re_a
                state = os.path.exists(pdf_path)
                # 查看pdf是否存在
                if state:
                    pass
                else:
                    with open('D:/GlobalLawFiles/nopdf.txt', 'a', encoding='utf-8') as f:
                        f.write(str(r[0]) + ', ' + r[1] + ', ' + r[2] + ', ' + r[3] + '\n')
                    # sysid.append(r[3])

        except Exception as e:
            print('mysql查找数据异常：' + str(e))
        urlcursor.close()
        # 创建删除数据游标
        urlcursor1 = urlconn.cursor()
        for sys in sysid:
            rediscursor = redis.Redis(connection_pool=pool)
            redis_key = REDIS_KEY[table]
            sqll = "DELETE FROM %s WHERE SortA = '%s' AND SYSID = '%s'" % (table, country ,sys)
            print(sqll)
            try:
                urlcursor1.execute(sqll)
                try:
                    rediscursor.srem(redis_key, sys)
                    urlconn.commit()
                except Exception as e:
                    print('Redis删除数据异常：' + str(e))
            except Exception as e:
                print('mysql删除数据异常：' + str(e))
        urlcursor1.close()

urlconn.close()
with open('D:/GlobalLawFiles/nopdf.txt', 'a', encoding='utf-8') as f:
    f.write('清洗结束\n')


